{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# IBM HR analytics"
      ],
      "metadata": {
        "id": "FntdtXUtnWlw"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Install evidently only when it is not already importable.\n",
        "try:\n",
        "    import evidently\n",
        "except ImportError:\n",
        "    # Catch ImportError only, so unrelated failures (e.g. KeyboardInterrupt) are not swallowed.\n",
        "    # %pip installs into the environment of the running kernel.\n",
        "    %pip install git+https://github.com/evidentlyai/evidently.git"
      ],
      "outputs": [],
      "metadata": {
        "id": "xxiKqQq3nfQf"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "%pip install catboost"
      ],
      "outputs": [],
      "metadata": {
        "id": "2GCt_DtRn4_f"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "#Import of the particular libraries \n",
        "import math\n",
        "import numpy as np\n",
        "import os \n",
        "import pandas as pd"
      ],
      "outputs": [],
      "metadata": {
        "id": "yI8tP92anWl6"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn import metrics\n",
        "from catboost import CatBoostClassifier"
      ],
      "outputs": [],
      "metadata": {
        "id": "XIewNLK-nWmA"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "from evidently.report import Report\n",
        "from evidently.metric_preset import ClassificationPreset\n",
        "from evidently.pipeline.column_mapping import ColumnMapping"
      ],
      "outputs": [],
      "metadata": {
        "id": "TSBOOaHJnWmB"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Source data"
      ],
      "metadata": {
        "id": "ZjqxxxQHnWmC"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Download the dataset from Kaggle: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset\n",
        "\n",
        "You can either download the CSV from Kaggle and upload it here manually, or fetch it with the Kaggle API: https://www.kaggle.com/docs/api"
      ],
      "metadata": {
        "id": "QhflIH6mqyj8"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Load the raw attrition data from a CSV file given by a relative path.\n",
        "dataset = pd.read_csv(\n",
        "    \"WA_Fn-UseC_-HR-Employee-Attrition.csv\",\n",
        "    sep=\",\",\n",
        "    header=0,\n",
        ")"
      ],
      "outputs": [],
      "metadata": {
        "id": "Jx8LET3inWmJ"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.head()"
      ],
      "outputs": [],
      "metadata": {
        "id": "bJjwSq6hnWmK"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.describe()"
      ],
      "outputs": [],
      "metadata": {
        "id": "o36K6V6RnWmY"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Feature engineering"
      ],
      "metadata": {
        "id": "74FFpCamnWmd"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "target_name = 'Attrition'"
      ],
      "outputs": [],
      "metadata": {
        "id": "SFZCUSXlnWmx"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "numerique_features = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',\n",
        "                      'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',\n",
        "                      'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating',\n",
        "                      'RelationshipSatisfaction','StockOptionLevel', 'TotalWorkingYears',\n",
        "                      'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',\n",
        "                      'YearsSinceLastPromotion', 'YearsWithCurrManager'\n",
        "                     ]"
      ],
      "outputs": [],
      "metadata": {
        "id": "CZhNoxAhnWmy"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "categorical_features = ['BusinessTravel', 'Department', 'EducationField', 'Gender',\n",
        "                        'JobRole', 'MaritalStatus', 'OverTime'\n",
        "                       ]"
      ],
      "outputs": [],
      "metadata": {
        "id": "z3y4XCJanWmz"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "processed_dataset = dataset.copy(deep = True)"
      ],
      "outputs": [],
      "metadata": {
        "id": "hrIpxNA0nWm7"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### Drop constant features"
      ],
      "metadata": {
        "id": "Alt68tlMnWm9"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Rebind instead of dropping in place, and ignore already-missing columns,\n",
        "# so re-running this cell does not raise a KeyError.\n",
        "constant_columns = ['EmployeeCount', 'StandardHours', 'Over18']\n",
        "processed_dataset = processed_dataset.drop(columns=constant_columns, errors='ignore')"
      ],
      "outputs": [],
      "metadata": {
        "id": "_Sb2e5LDnWm_"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### BusinessTravel"
      ],
      "metadata": {
        "id": "hrwb2A_BnWnB"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "processed_dataset.BusinessTravel.value_counts()"
      ],
      "outputs": [],
      "metadata": {
        "id": "6LeZhrkCnWnB"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "business_travel_dummies = pd.get_dummies(processed_dataset.BusinessTravel, prefix = 'b_travel')\n",
        "processed_dataset = pd.concat([processed_dataset, business_travel_dummies], axis=1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "EGQMhSyXnWnC"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### Department"
      ],
      "metadata": {
        "id": "9MJWixMxnWno"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.Department.value_counts()"
      ],
      "outputs": [],
      "metadata": {
        "id": "4XTvszyKnWnq"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "department_dummies = pd.get_dummies(processed_dataset.Department, prefix = 'department')\n",
        "processed_dataset = pd.concat([processed_dataset, department_dummies], axis=1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "MAexXFJRnWnq"
      }
    },
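    {
      "cell_type": "markdown",
      "source": [
        "#### EducationField\n",
        "\n",
        "**Note:** the encoding cell below builds the `edu_field_*` dummy columns from `Department`, not `EducationField`, so they duplicate the `department_*` columns. Switching it to `EducationField` also requires updating the `edu_field_*` names in the `features` list and in `column_mapping.categorical_features`."
      ],
      "metadata": {
        "id": "J3-E0tLPnWns"
      }
    },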
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.EducationField.value_counts()"
      ],
      "outputs": [],
      "metadata": {
        "id": "gOZUOQNgnWnt"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "edu_field_dummies = pd.get_dummies(processed_dataset.Department, prefix = 'edu_field')\n",
        "processed_dataset = pd.concat([processed_dataset, edu_field_dummies], axis=1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "FtV7j9KAnWnz"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### Gender"
      ],
      "metadata": {
        "id": "c_bD6GQInWn0"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.Gender.value_counts()"
      ],
      "outputs": [],
      "metadata": {
        "id": "XOMMyW0MnWn1"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "def _encode_gender(value):\n",
        "    \"\"\"Return 0 for 'Male', 1 for 'Female' and -1 for any other value.\"\"\"\n",
        "    if value == 'Male':\n",
        "        return 0\n",
        "    if value == 'Female':\n",
        "        return 1\n",
        "    return -1\n",
        "\n",
        "\n",
        "processed_dataset['gender_bin'] = processed_dataset.Gender.apply(_encode_gender)"
      ],
      "outputs": [],
      "metadata": {
        "id": "FTNSgJExnWn2"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### JobRole"
      ],
      "metadata": {
        "id": "UWlOGL6NnWn5"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.JobRole.value_counts()"
      ],
      "outputs": [],
      "metadata": {
        "id": "cpzrfDlknWn5"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "job_role_dummies = pd.get_dummies(processed_dataset.JobRole, prefix = 'job_role')\n",
        "processed_dataset = pd.concat([processed_dataset, job_role_dummies], axis=1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "bVNa1iDZnWn6"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### MaritalStatus"
      ],
      "metadata": {
        "id": "VeSXQqd_nWn7"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.MaritalStatus.value_counts()"
      ],
      "outputs": [],
      "metadata": {
        "id": "sBlKaLgInWn7"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "marital_dummies = pd.get_dummies(processed_dataset.MaritalStatus, prefix = 'marital')\n",
        "processed_dataset = pd.concat([processed_dataset, marital_dummies], axis=1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "gYvc-xFPnWn8"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### OverTime"
      ],
      "metadata": {
        "id": "HKDhTtRGnWn9"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "dataset.OverTime.value_counts()"
      ],
      "outputs": [],
      "metadata": {
        "id": "ZAYzVnOunWn-"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "overtime_dummies = pd.get_dummies(processed_dataset.OverTime, prefix = 'overtime')\n",
        "processed_dataset = pd.concat([processed_dataset, overtime_dummies], axis=1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "pBzLh8SdnWoB"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "#drop initial categorical features\n",
        "processed_dataset.drop(columns = categorical_features, inplace = True)"
      ],
      "outputs": [],
      "metadata": {
        "id": "kgODaaIknWoD"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Target"
      ],
      "metadata": {
        "id": "5sQnycssnWoE"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "def _encode_attrition(value):\n",
        "    \"\"\"Return 0 for 'No', 1 for 'Yes' and -1 for any other value.\"\"\"\n",
        "    if value == 'No':\n",
        "        return 0\n",
        "    if value == 'Yes':\n",
        "        return 1\n",
        "    return -1\n",
        "\n",
        "\n",
        "processed_dataset['target'] = processed_dataset.Attrition.apply(_encode_attrition)"
      ],
      "outputs": [],
      "metadata": {
        "id": "rsA4nPLOnWoE"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Dataset generation"
      ],
      "metadata": {
        "id": "7zAVWWuanWoF"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "features = ['Age','DailyRate', 'DistanceFromHome', 'Education',\n",
        "       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',\n",
        "       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',\n",
        "       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',\n",
        "       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',\n",
        "       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',\n",
        "       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',\n",
        "       'YearsWithCurrManager', 'b_travel_Non-Travel',\n",
        "       'b_travel_Travel_Frequently', 'b_travel_Travel_Rarely',\n",
        "       'department_Human Resources', 'department_Research & Development',\n",
        "       'department_Sales', 'edu_field_Human Resources',\n",
        "       'edu_field_Research & Development', 'edu_field_Sales', 'gender_bin',\n",
        "       'job_role_Healthcare Representative', 'job_role_Human Resources',\n",
        "       'job_role_Laboratory Technician', 'job_role_Manager',\n",
        "       'job_role_Manufacturing Director', 'job_role_Research Director',\n",
        "       'job_role_Research Scientist', 'job_role_Sales Executive',\n",
        "       'job_role_Sales Representative', 'marital_Divorced', 'marital_Married',\n",
        "       'marital_Single', 'overtime_No', 'overtime_Yes',]"
      ],
      "outputs": [],
      "metadata": {
        "id": "dHFuHwq4nWoG"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### Train & Holdout Test Split"
      ],
      "metadata": {
        "id": "XUeFLTo1nWoH"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "RANDOM_STATE = 1603"
      ],
      "outputs": [],
      "metadata": {
        "id": "jmbSVswHnWoH"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "train_data, test_data, train_y, test_y = train_test_split(processed_dataset[features], processed_dataset.target,\n",
        "                                                   random_state = RANDOM_STATE, test_size = 0.25,\n",
        "                                                   stratify = processed_dataset.target)"
      ],
      "outputs": [],
      "metadata": {
        "id": "EPQIQV3tnWoI"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Modeling"
      ],
      "metadata": {
        "id": "rLYYVS13nWoI"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Baseline"
      ],
      "metadata": {
        "id": "2drlTMsPnWoJ"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "rf = RandomForestClassifier(n_estimators=500, n_jobs = -1, random_state = 11)"
      ],
      "outputs": [],
      "metadata": {
        "id": "Y9uSLcPrnWoK"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "rf.fit(train_data[features], train_y)"
      ],
      "outputs": [],
      "metadata": {
        "id": "4vntlMtGnWoL"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Baseline Classification Report"
      ],
      "metadata": {
        "id": "Iu5OqlHMnWoL"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Wrap the random forest's predict_proba output in frames whose columns are labelled 'no' / 'yes'.\n",
        "train_probas = pd.DataFrame(\n",
        "    rf.predict_proba(train_data[features]),\n",
        "    columns=['no', 'yes'],\n",
        ")\n",
        "test_probas = pd.DataFrame(\n",
        "    rf.predict_proba(test_data[features]),\n",
        "    columns=['no', 'yes'],\n",
        ")"
      ],
      "outputs": [],
      "metadata": {
        "id": "hiG4zonBnWoM"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "train_data.reset_index(inplace=True, drop=True)\n",
        "train_data['Attrition'] = ['no' if x == 0 else 'yes' for x in train_y]\n",
        "rf_merged_train = pd.concat([train_data, train_probas], axis = 1)\n",
        "\n",
        "test_data.reset_index(inplace=True, drop=True)\n",
        "test_data['Attrition'] = ['no' if x == 0 else 'yes' for x in test_y]\n",
        "rf_merged_test = pd.concat([test_data, test_probas], axis = 1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "HWiqDB_jnWoN"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "column_mapping = ColumnMapping()\n",
        "\n",
        "column_mapping.target = 'Attrition'\n",
        "column_mapping.prediction = ['yes', 'no']\n",
        "column_mapping.pos_label = 'yes'\n",
        "\n",
        "column_mapping.numerical_features = ['Age','DailyRate', 'DistanceFromHome', 'Education',\n",
        "       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',\n",
        "       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',\n",
        "       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',\n",
        "       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',\n",
        "       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',\n",
        "       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',\n",
        "       'YearsWithCurrManager']\n",
        "\n",
        "column_mapping.categorical_features = ['b_travel_Non-Travel',\n",
        "       'b_travel_Travel_Frequently', 'b_travel_Travel_Rarely',\n",
        "       'department_Human Resources', 'department_Research & Development',\n",
        "       'department_Sales', 'edu_field_Human Resources',\n",
        "       'edu_field_Research & Development', 'edu_field_Sales', 'gender_bin',\n",
        "       'job_role_Healthcare Representative', 'job_role_Human Resources',\n",
        "       'job_role_Laboratory Technician', 'job_role_Manager',\n",
        "       'job_role_Manufacturing Director', 'job_role_Research Director',\n",
        "       'job_role_Research Scientist', 'job_role_Sales Executive',\n",
        "       'job_role_Sales Representative', 'marital_Divorced', 'marital_Married',\n",
        "       'marital_Single', 'overtime_No', 'overtime_Yes']"
      ],
      "outputs": [],
      "metadata": {
        "id": "LZIAOc5NnWoN"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "classification_performance_report = Report(metrics=[\n",
        "    ClassificationPreset(),\n",
        "])\n",
        "\n",
        "classification_performance_report.run(reference_data=rf_merged_train, current_data=rf_merged_test, column_mapping = column_mapping)\n",
        "\n",
        "classification_performance_report"
      ],
      "outputs": [],
      "metadata": {
        "id": "gkZT5QkNJlVM"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "#classification_performance_report.save_html('ibm_hr_attrition_baseline_performance.html')"
      ],
      "outputs": [],
      "metadata": {
        "id": "zxEOMfatnWoP"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Better model"
      ],
      "metadata": {
        "id": "kDybWAGmnWoR"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "cat = CatBoostClassifier(random_state= 11, iterations=1600, learning_rate=0.008, class_weights = {0:1, 1:6})"
      ],
      "outputs": [],
      "metadata": {
        "id": "mxCs-_arnWoR"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "cat.fit(train_data[features], train_y)"
      ],
      "outputs": [],
      "metadata": {
        "id": "yNSEXLJjnWoS"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Wrap the CatBoost model's predict_proba output in frames whose columns are labelled 'no' / 'yes'.\n",
        "train_probas = pd.DataFrame(\n",
        "    cat.predict_proba(train_data[features]),\n",
        "    columns=['no', 'yes'],\n",
        ")\n",
        "test_probas = pd.DataFrame(\n",
        "    cat.predict_proba(test_data[features]),\n",
        "    columns=['no', 'yes'],\n",
        ")"
      ],
      "outputs": [],
      "metadata": {
        "id": "ZnxOftCSnWoT"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Store the labels under 'Attrition', the column that column_mapping.target points to,\n",
        "# so this cell does not rely on the baseline cell having added that column earlier\n",
        "# and the merged frames carry the same columns as the baseline ones.\n",
        "train_data.reset_index(inplace=True, drop=True)\n",
        "train_data['Attrition'] = ['no' if x == 0 else 'yes' for x in train_y]\n",
        "cat_merged_train = pd.concat([train_data, train_probas], axis = 1)\n",
        "\n",
        "test_data.reset_index(inplace=True, drop=True)\n",
        "test_data['Attrition'] = ['no' if x == 0 else 'yes' for x in test_y]\n",
        "cat_merged_test = pd.concat([test_data, test_probas], axis = 1)"
      ],
      "outputs": [],
      "metadata": {
        "id": "rEXDusS8nWoT"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "classification_performance_report = Report(metrics=[\n",
        "    ClassificationPreset(),\n",
        "])\n",
        "\n",
        "classification_performance_report.run(reference_data=cat_merged_train, current_data=cat_merged_test, column_mapping = column_mapping)\n",
        "\n",
        "classification_performance_report"
      ],
      "outputs": [],
      "metadata": {
        "id": "zsr7ZwgEISZw"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "#classification_performance_report.save_html('ibm_hr_attrition_better_model_performance.html')"
      ],
      "outputs": [],
      "metadata": {
        "id": "6GPsfafFnWoU"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Models comparison"
      ],
      "metadata": {
        "id": "Kep_f9SfnWoX"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "classification_performance_report.run(reference_data=rf_merged_test, current_data=cat_merged_test, column_mapping = column_mapping)\n",
        "\n",
        "classification_performance_report"
      ],
      "outputs": [],
      "metadata": {
        "id": "SrRhNjT-nWoY"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "#classification_performance_report.save_html('ibm_hr_attrition_model_comparison.html')"
      ],
      "outputs": [],
      "metadata": {
        "id": "_XoKUwQjnWoY"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Support Evidently\n",
        "Enjoyed the tutorial? Star Evidently on GitHub to contribute back! This helps us continue creating free open-source tools for the community. https://github.com/evidentlyai/evidently"
      ],
      "metadata": {}
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.8"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}